Attribute Information:

DateTime: timestamp, one observation every ten minutes. Temperature: weather temperature of Tetouan city. Humidity: weather humidity of Tetouan city. Wind Speed: wind speed of Tetouan city. General Diffuse Flows and Diffuse Flows: solar radiation measurements. Zone 1, Zone 2, and Zone 3 Power Consumption: power consumption of the three distribution zones of Tetouan city.

rm(list = ls()) # initialization -- NOTE(review): clearing the global environment inside a script is discouraged; prefer starting from a fresh R session
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(lubridate)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(zoo)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
# Read the Tetouan power-consumption data (10-minute observations for 2017);
# the last day (Dec 30) is held out for testing further below
df <- read.csv("Tetuan City power consumption.csv")
colnames(df)
## [1] "DateTime"                  "Temperature"              
## [3] "Humidity"                  "Wind.Speed"               
## [5] "general.diffuse.flows"     "diffuse.flows"            
## [7] "Zone.1.Power.Consumption"  "Zone.2..Power.Consumption"
## [9] "Zone.3..Power.Consumption"
# Rename the three zone columns first so that df$Zone1 actually exists
colnames(df)[7] <- "Zone1"
colnames(df)[8] <- "Zone2"
colnames(df)[9] <- "Zone3"
# Check for missing values. Bug fix: the original ran this check BEFORE the
# rename, so is.na(df$Zone1) was is.na(NULL) and the sum was vacuously 0.
sum(is.na(df$Zone1))
## [1] 0

EDA

check stationarity

# Quick EDA: time-series plot and ACF for every non-date column.
# `var` is assigned globally on purpose -- the stationarity tests below reuse it.
var <- colnames(df)[-1]
for (i in var) {
  ts.plot(df[[i]], ylab = i)
  acf(df[[i]], ylab = paste0(i, " ACF"), main = "")
}

library(tseries)
# Formal stationarity tests for every non-date column
for (i in var) {
  print(adf.test(df[[i]]))                   # ADF: alternative = stationary
  print(kpss.test(df[[i]], null = "Trend"))  # KPSS: null = trend-stationary
}
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -13.207, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 42.545, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -20.358, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 2.1044, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -7.717, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 16.004, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -35.446, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 4.9499, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -32.848, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 1.3615, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -32.63, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 6.6416, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -31.192, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 2.089, Truncation lag parameter = 19, p-value = 0.01
## Warning in adf.test(df[[i]]): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  df[[i]]
## Dickey-Fuller = -19.704, Lag order = 37, p-value = 0.01
## alternative hypothesis: stationary
## Warning in kpss.test(df[[i]], null = "Trend"): p-value smaller than printed p-
## value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  df[[i]]
## KPSS Trend = 23.651, Truncation lag parameter = 19, p-value = 0.01
# Replace the raw DateTime strings with a regular 10-minute POSIXct sequence
# starting 2017-01-01 00:00 (one timestamp per row).
# `date` stays global -- it indexes the xts object built below.
date <- seq(
  from = as.POSIXct("2017-01-01 00:00", format = "%Y-%m-%d %H:%M"),
  length.out = nrow(df),
  by = "10 min"
)
df$DateTime <- date
#attr(df$DateTime, "tzone") <- "Africa/Casablanca"

# Calendar features derived from the timestamp
df$year   <- year(df$DateTime)
df$month  <- month(df$DateTime)
df$week   <- week(df$DateTime)
df$day    <- day(df$DateTime)
df$hour   <- hour(df$DateTime)
df$minute <- minute(df$DateTime)

train test split

# xts of the target series, indexed by timestamp. Refer to the column by
# name instead of the magic index 7 (column 7 is Zone1 after renaming).
df_xts <- xts(df$Zone1, order.by = date)

daily

# Daily split: train on Jan 1 - Dec 29, hold out the final day (Dec 30)
train_daily <- df_xts["2017-01-01/2017-12-29"]
test_daily  <- df_xts["2017-12-30"]

plot(train_daily, main = "Zone 1 Power Consumption",
     xlab = "Time", ylab = "Watt Hours")
plot(test_daily, main = "Zone 1 Power Consumption",
     xlab = "Time", ylab = "Watt Hours")

weekly

# Earlier ts()-based weekly plot, kept for reference:
#df_weekly <- filter(df,week == 1)
#Z1_weekly <- ts(df_weekly$Zone1, frequency=6*24*52, start=c(2017,1))
#plot(Z1_weekly, xlab = "Time", ylab = "Watt Hours", main = "Zone 1 Power Consumption")

# Weekly split: hold out the final full week (Dec 24 - Dec 30)
train_weekly <- df_xts["2017-01-01/2017-12-23"]
test_weekly  <- df_xts["2017-12-24/2017-12-30"]

plot(train_weekly, main = "Zone 1 Power Consumption",
     xlab = "Time", ylab = "Watt Hours")
plot(test_weekly, main = "Zone 1 Power Consumption",
     xlab = "Time", ylab = "Watt Hours")

monthly

# Monthly split: hold out the final 30 days (Dec 1 - Dec 30)
train_monthly <- df_xts["2017-01-01/2017-11-30"]
test_monthly  <- df_xts["2017-12-01/2017-12-30"]

plot(train_monthly, main = "Zone 1 Power Consumption",
     xlab = "Time", ylab = "Watt Hours")
plot(test_monthly, main = "Zone 1 Power Consumption",
     xlab = "Time", ylab = "Watt Hours")

Seasonal Arima

daily

# Decompose the DAILY training series (daily cycle = 6 obs/hour * 24 h = 144).
# Bug fix: the original decomposed train_weekly in this daily section; use
# train_daily so the decomposition matches the series stlf() is fitted on.
comp_daily <- decompose(ts(train_daily, frequency = 6 * 24))
plot(comp_daily)

fit_daily <- stlf(ts(train_daily, frequency = 6 * 24))
fit_daily$model
## ETS(A,N,N) 
## 
## Call:
##  ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend) 
## 
##   Smoothing parameters:
##     alpha = 0.9989 
## 
##   Initial states:
##     l = 35146.4894 
## 
##   sigma:  278.7779
## 
##     AIC    AICc     BIC 
## 1156524 1156524 1156551
# One-day-ahead forecast from the daily model
pred_daily <- forecast(fit_daily, h = nrow(test_daily))
autoplot(pred_daily)

# Overlay actuals vs predictions on the test day
pred_df_daily <- data.table(time     = index(test_daily),
                            test_val = test_daily[, 1],
                            pred_val = pred_daily$mean)
d <- melt(pred_df_daily, id.vars = "time")
p <- ggplot(d, aes(x = time, y = value, color = variable)) +
  geom_point(size = 1) +
  geom_line()
p + labs(title = "Zone 1 Power Consumption: daily",
         x = "Time", y = "Watt Hours")

weekly

# Weekly seasonality with 10-minute data is 6 * 24 * 7 = 1008 observations
# per cycle. (The original used frequency = 6*24*52, apparently confusing
# "52 weeks per year" with observations-per-week; 6*24*52 = 8736 matches no
# calendar period.) NOTE(review): this changes the fitted model relative to
# the printed output below -- re-knit to refresh it.
comp_weekly <- decompose(ts(train_weekly, frequency = 6 * 24 * 7))
plot(comp_weekly)

fit_weekly <- stlf(ts(train_weekly, frequency = 6 * 24 * 7))
fit_weekly$model
## ETS(A,Ad,N) 
## 
## Call:
##  ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.6346 
##     phi   = 0.8 
## 
##   Initial states:
##     l = 31972.1007 
##     b = -337.3653 
## 
##   sigma:  394.0297
## 
##     AIC    AICc     BIC 
## 1172130 1172130 1172183
# One-week-ahead forecast from the weekly model
pred_weekly <- forecast(fit_weekly, h = nrow(test_weekly))
autoplot(pred_weekly)

# Overlay actuals vs predictions on the test week
pred_df_weekly <- data.table(time     = index(test_weekly),
                             test_val = test_weekly[, 1],
                             pred_val = pred_weekly$mean)
d <- melt(pred_df_weekly, id.vars = "time")
p <- ggplot(d, aes(x = time, y = value, color = variable)) +
  geom_point(size = 1) +
  geom_line()
p + labs(title = "Zone 1 Power Consumption: weekly",
         x = "Time", y = "Watt Hours")

monthly

decomposition and model fit on the monthly split

# Decompose and fit the monthly training series
# (monthly cycle = 6 obs/hour * 24 h * 30 d = 4320 observations)
comp_monthly <- decompose(ts(train_monthly, frequency = 6 * 24 * 30))
plot(comp_monthly)

fit_monthly <- stlf(ts(train_monthly, frequency = 6 * 24 * 30))
fit_monthly$model
## ETS(A,Ad,N) 
## 
## Call:
##  ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.6058 
##     phi   = 0.8001 
## 
##   Initial states:
##     l = 32492.7206 
##     b = -306.6227 
## 
##   sigma:  408.1339
## 
##     AIC    AICc     BIC 
## 1096795 1096795 1096848
# Forecast the MONTHLY model over the monthly test window.
# Bug fix: the original called forecast(fit_weekly, h = nrow(test_weekly)),
# producing only 1008 weekly predictions that data.table silently recycled
# against the 4320-row monthly test set (hence the recycling warning in the
# original output). With the fix the recycling warning disappears.
pred_monthly <- forecast(fit_monthly, h = nrow(test_monthly))
autoplot(pred_monthly)

pred_df_monthly <- data.table(time     = index(test_monthly),
                              test_val = test_monthly[, 1],
                              pred_val = pred_monthly$mean)
d <- melt(pred_df_monthly, id.vars = "time")
p <- ggplot(d, aes(x = time, y = value, color = variable)) +
  geom_point(size = 1) +
  geom_line()
p + labs(title = "Zone 1 Power Consumption: monthly",
         x = "Time", y = "Watt Hours")

annual

# Fit an ARIMA on the full series with annual frequency.
# NOTE(review): 6*24*365 = 52560 exceeds the series length (~52416 obs), so
# auto.arima cannot estimate a seasonal component here -- the selected model
# below is indeed non-seasonal ARIMA(3,1,1).
fit_annual <- auto.arima(ts(df$Zone1, frequency = 6 * 24 * 365), seasonal = TRUE)
fit_annual
## Series: ts(df$Zone1, frequency = 6 * 24 * 365) 
## ARIMA(3,1,1) 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1
##       1.0600  -0.2510  0.0675  -0.4518
## s.e.  0.0299   0.0183  0.0064   0.0298
## 
## sigma^2 = 202928:  log likelihood = -394643.6
## AIC=789297.2   AICc=789297.2   BIC=789341.6
# Forecast one day ahead and compare against the daily test window.
# Bug fix: the time axis must index the TEST window -- the original used
# index(df_xts) (the whole year, ~52k rows), which silently recycled the
# 144 test values and produced a meaningless plot.
pred_annual <- forecast(fit_annual, h = nrow(test_daily))
pred_df_annual <- data.table(time     = index(test_daily),
                             test_val = test_daily[, 1],
                             pred_val = pred_annual$mean)
d <- melt(pred_df_annual, id.vars = "time")
ggplot(d, aes(x = time, y = value, color = variable)) +
  geom_point(size = 1) +
  geom_line()

# SMAPE for each horizon. Metrics::smape(actual, predicted) is symmetric in
# its two arguments, so argument order does not change the value. Coerce
# everything to plain numeric: the original mixed ts and xts objects, which
# triggered Incompatible-methods ("Ops.ts"/"Ops.xts") warnings inside smape().
smape(as.numeric(test_daily[, 1]), as.numeric(pred_df_daily$pred_val))
## [1] 0.06732304
smape(as.numeric(test_weekly[, 1]), as.numeric(pred_df_weekly$pred_val))
## [1] 0.07291239
smape(as.numeric(test_monthly[, 1]), as.numeric(pred_df_monthly$pred_val))
## [1] 0.06944146
smape(as.numeric(test_daily[, 1]), as.numeric(pred_annual$mean))
## [1] 0.2071637

Linear regression with ARMA errors (use arima with xreg)

monthly

# Hold out the last 30 days (6 obs/hour * 24 h * 30 d = 4320 rows) for testing
train <- df[1:(nrow(df) - 6 * 24 * 30), ]
test  <- df[(nrow(df) - 6 * 24 * 30 + 1):nrow(df), ]

# Linear regression with ARMA errors: Temperature as the exogenous regressor.
# The forecasting xreg is wrapped in as.matrix() to match the matrix used in
# fitting (a single-column data.frame subset drops to a plain vector).
temp_xreg <- as.matrix(train[, c("Temperature")])
lm_model <- forecast::auto.arima(train$Zone1, xreg = temp_xreg)
lm_res <- forecast(lm_model, h = nrow(test),
                   xreg = as.matrix(test[, c("Temperature")]))$mean
lm_model
## Series: train$Zone1 
## Regression with ARIMA(3,1,1) errors 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1     xreg
##       1.0324  -0.2322  0.0700  -0.4306  32.5107
## s.e.  0.0320   0.0194  0.0068   0.0320  11.2340
## 
## sigma^2 = 210310:  log likelihood = -362976
## AIC=725964   AICc=725964   BIC=726016.7
# Bug fix: the time axis must come from the TEST rows -- the original used
# train$DateTime (48096 rows), which silently recycled the 4320 test values
# (hence the data.table recycling warnings in the original output).
pred_lm_monthly <- data.table(time     = test$DateTime,
                              test_val = test$Zone1,
                              pred_val = lm_res)
d <- melt(pred_lm_monthly, id.vars = "time")
p <- ggplot(d, aes(x = time, y = value, color = variable)) +
  geom_point(size = 1) +
  geom_line()
p + labs(title = "Zone 1 Power Consumption: monthly",
         x = "Time", y = "Watt Hours")

# SMAPE of the regression-with-ARMA-errors forecast over the 30-day test set
# (SMAPE is symmetric, so the predicted-first argument order does not affect
# the value)
smape(lm_res, test$Zone1)
## [1] 0.2082446